library(tidyverse)
library(reshape2)
library(ggeasy)

net_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_singlenuclei/2nd_pass/snakemake-sn/results/"
expression_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_singlenuclei/2nd_pass/snakemake-sn/input/"
work_dir = "/pastel/Github_scripts/SpeakEasy_dlpfc/sn_dlpfc/2nd_pass/"

Top 5 hub genes

Hub genes are the most connected genes in a module so, here we list the top ones by module for all the cell-type-specific networks.

#############################
# Edges table
#############################

################################## Variables to set
# edge_weight2plot = 0.01 # Filter the edges by this value. This is the fraction of the top edge correlations. 0.1 = top 10%.
edge_weight_1 = 1 # We don't wanna plot the correlations = 1 
correlation_method = "pearson"
################################## Done

cell_list = c("mic", "ast", "oli", "end", "opc", "inh", "ext")

nodes_df = data.frame()
edges_df = data.frame()

for(j in cell_list){
  net_type = j
  
  # Expression data for a single set:
  exprData = read.table(paste0(expression_dir, net_type, ".txt"), header = T, stringsAsFactors = F, check.names = F) #cell_type or region_type ... 
  expr_matx = as.data.frame(exprData) # Residuals of the expression
  
  gene_modules = distinct(read.table(paste0(net_dir, net_type, "/geneBycluster.txt"), header = T, stringsAsFactors = F)) # clusters from SpeakEasy
  gene_modules_size = gene_modules %>% group_by(cluster_lv3) %>% summarise(n = n()) %>% filter(n >= 30)
  n_modules = nrow(gene_modules_size)
  
  for(i in 1:n_modules){
    #i=1
    mod2plot = gene_modules_size$cluster_lv3[i]
    modSize = gene_modules_size$n[i]
    
    # Select the expression values for the module of interest 
    to_plot = gene_modules$ensembl[gene_modules$cluster_lv3 == mod2plot]
    expr_matx_mod = expr_matx[to_plot, ]
    
    #Let's calculate the correlation matrix
    matx_cor = cor(t(expr_matx_mod), method = correlation_method) # must be gene by gene
    matx_cor_m = melt(matx_cor)
    colnames(matx_cor_m) = c("ensembl1", "ensembl2", "value")
    
    matx_cor_temp = matx_cor_m %>% left_join(gene_modules[, c("ensembl", "gene_name")], by=c("ensembl1" = "ensembl"))
    colnames(matx_cor_temp) = c("ensembl1", "ensembl2", "value", "from_node")
    
    # Complete correlation matrix 
    matx_cor_temp2 = matx_cor_temp %>% left_join(gene_modules[, c("ensembl", "gene_name")], by=c("ensembl2" = "ensembl"))
    colnames(matx_cor_temp2) = c("ensembl1", "ensembl2", "value", "from_node", "to_node")
    
    # Filter the edges based on correlation (correlation is used as weight here)
    edges_before_filtering = matx_cor_temp2[which(abs(matx_cor_temp2$value) != edge_weight_1), ]
    
    #order_of_edges = order(abs(edges_before_filtering$value), decreasing = T)
    #order_of_edges_filt = order_of_edges[1:round(length(order_of_edges)*edge_weight2plot)]
    #edges_filtered = edges_before_filtering[order_of_edges_filt,]
    
    # edges2keep = which(edges_before_filtering$value >= (median(abs(edges_before_filtering$value)))) # Get the edges > than the median  
    edges2keep = which(edges_before_filtering$value >= quantile(abs(edges_before_filtering$value), 3/4)) # Get the edges on the 3/4 quantiles 
    edges_filtered = edges_before_filtering[edges2keep,]
    
    colnames(edges_filtered) = c("fromNode", "toNode", "weight", "fromAltName", "toAltName") # match names for Cytoscape input
    
    # Remove NAs and duplicated ensembls 
    #edges_filtered = na.omit(edges_filtered)
    #edges_filtered = edges_filtered[! duplicated(edges_filtered), ]
    edges_filtered_df = edges_filtered
    edges_filtered_df$module = mod2plot
    edges_filtered_df$celltype = net_type
    
    edges_df = rbind(edges_df, edges_filtered_df) # Filtered edges table with all modules from all cell types 
    
    #############################
    # Nodes table 
    #############################
    nodes_table = gene_modules[ , c("ensembl", "cluster_lv3", "gene_name")]
    colnames(nodes_table) = c("nodeName", "nodeAttr", "altName") # Match for Cytoscape input 
    nodes_table = nodes_table[nodes_table$nodeAttr == mod2plot, ] # Only for the module of interest
    
    # Apply same filter to the nodes table
    nodes_filtered = nodes_table[nodes_table$nodeName %in% c(edges_filtered$fromNode,edges_filtered$toNode), ]
    
    # Add number of node connections in the filtered network to get the hub nodes
    nodes_filtered = inner_join(nodes_filtered, 
                                bind_rows(edges_filtered %>% group_by(fromNode) %>% dplyr::count() %>% rename(nodeName = fromNode), 
                                          edges_filtered %>% group_by(toNode) %>% dplyr::count() %>% rename(nodeName = toNode)) %>% 
                                  group_by(nodeName) %>% distinct %>% tally(wt = n))
    
    hub_genes_list = nodes_filtered[order(nodes_filtered$n, decreasing = T), ]$altName[1:5] # Hubs are the most connected nodes in a module (after filtering the edges)
    
    nodes_filtered_df = nodes_filtered[order(nodes_filtered$n, decreasing = T),]
    nodes_filtered_df$module_size = modSize
    nodes_filtered_df$celltype = net_type
    nodes_df = rbind(nodes_df, nodes_filtered_df) # Filtered nodes table with all modules from all cell types 
  }
}


save(nodes_df, file = paste0(work_dir, "nodes_df_sn.Rdata"))
# nodes_df %>% group_by(celltype,module_size) %>% summarise(n_genes = n()) %>% arrange(n_genes)
load(paste0(work_dir, "nodes_df_sn.Rdata"))
# Top hub genes 
nodes_df_top = nodes_df %>% 
  group_by(celltype,module_size) %>% 
  arrange(-n) %>% 
  slice_head(n = 5)

write.table(nodes_df_top, file = paste0(work_dir, "hub_genes_sn.txt"), sep = "\t", quote = F, row.names = F)
createDT(nodes_df_top)

Plot connections

We want to check if the network has a scale-free topology.

Scale-free: many nodes with only a few links. A few hubs with large number of links.

Random: most nodes have the same number of links. No highly connected nodes resulting in a Poison distribution.

ggplot(nodes_df, aes(x=n)) +
  geom_histogram(bins = 50, color="black", fill="white") +
  facet_wrap(~celltype, scales = "free") + 
  easy_labs(x = "Number of links (k)", y = "Number of nodes with k links") +
  theme_classic()

Session

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeasy_0.1.3    reshape2_1.4.4  forcats_0.5.1   stringr_1.5.0  
##  [5] dplyr_1.1.3     purrr_1.0.2     readr_2.1.4     tidyr_1.3.0    
##  [9] tibble_3.2.1    ggplot2_3.4.4   tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.11       lubridate_1.8.0   digest_0.6.33     utf8_1.2.4       
##  [5] R6_2.5.1          cellranger_1.1.0  plyr_1.8.9        backports_1.4.1  
##  [9] reprex_2.0.1      evaluate_0.23     highr_0.10        httr_1.4.7       
## [13] pillar_1.9.0      rlang_1.1.2       readxl_1.3.1      rstudioapi_0.15.0
## [17] jquerylib_0.1.4   DT_0.30           rmarkdown_2.25    labeling_0.4.3   
## [21] htmlwidgets_1.6.2 munsell_0.5.0     broom_1.0.5       compiler_4.1.2   
## [25] modelr_0.1.8      xfun_0.41         pkgconfig_2.0.3   htmltools_0.5.7  
## [29] tidyselect_1.2.0  fansi_1.0.5       crayon_1.5.2      tzdb_0.4.0       
## [33] dbplyr_2.4.0      withr_2.5.2       grid_4.1.2        jsonlite_1.8.7   
## [37] gtable_0.3.4      lifecycle_1.0.3   DBI_1.1.3         magrittr_2.0.3   
## [41] scales_1.2.1      cli_3.6.1         stringi_1.7.12    cachem_1.0.8     
## [45] farver_2.1.1      fs_1.6.3          xml2_1.3.5        bslib_0.5.1      
## [49] ellipsis_0.3.2    generics_0.1.3    vctrs_0.6.4       tools_4.1.2      
## [53] glue_1.6.2        hms_1.1.3         crosstalk_1.2.0   fastmap_1.1.1    
## [57] yaml_2.3.7        colorspace_2.1-0  rvest_1.0.2       knitr_1.45       
## [61] haven_2.4.3       sass_0.4.7